import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
pathfile = '../data/loans_ind.csv'
loans_ind = pd.read_csv(pathfile , engine = 'python')
list(loans_ind)
['term', 'installment', 'grade', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'loan_status', 'purpose', 'title', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim', 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_inq', 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tax_liens', 'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit', 'fico_avg']
Para el análisis EDA se procede a eliminar los valores NaN en vez de tratar estos valores, esto se realiza con la finalidad de no alterar el análisis.
loans_eda_ind = loans_ind.dropna()
loans_eda_ind.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 367048 entries, 0 to 460540 Data columns (total 65 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 term 367048 non-null float64 1 installment 367048 non-null float64 2 grade 367048 non-null object 3 emp_length 367048 non-null float64 4 home_ownership 367048 non-null object 5 annual_inc 367048 non-null float64 6 verification_status 367048 non-null object 7 loan_status 367048 non-null float64 8 purpose 367048 non-null object 9 title 367048 non-null object 10 addr_state 367048 non-null object 11 dti 367048 non-null float64 12 delinq_2yrs 367048 non-null int64 13 earliest_cr_line 367048 non-null object 14 inq_last_6mths 367048 non-null int64 15 mths_since_last_delinq 367048 non-null int64 16 open_acc 367048 non-null int64 17 pub_rec 367048 non-null int64 18 revol_bal 367048 non-null int64 19 revol_util 367048 non-null float64 20 total_acc 367048 non-null int64 21 initial_list_status 367048 non-null object 22 collections_12_mths_ex_med 367048 non-null int64 23 policy_code 367048 non-null int64 24 application_type 367048 non-null object 25 acc_now_delinq 367048 non-null int64 26 tot_coll_amt 367048 non-null int64 27 tot_cur_bal 367048 non-null int64 28 total_rev_hi_lim 367048 non-null int64 29 acc_open_past_24mths 367048 non-null int64 30 avg_cur_bal 367048 non-null float64 31 bc_open_to_buy 367048 non-null float64 32 bc_util 367048 non-null float64 33 chargeoff_within_12_mths 367048 non-null int64 34 delinq_amnt 367048 non-null int64 35 mo_sin_old_il_acct 367048 non-null float64 36 mo_sin_old_rev_tl_op 367048 non-null int64 37 mo_sin_rcnt_rev_tl_op 367048 non-null int64 38 mo_sin_rcnt_tl 367048 non-null int64 39 mort_acc 367048 non-null int64 40 mths_since_recent_bc 367048 non-null float64 41 mths_since_recent_inq 367048 non-null float64 42 num_accts_ever_120_pd 367048 non-null int64 43 num_actv_bc_tl 367048 non-null int64 44 num_actv_rev_tl 367048 non-null int64 45 num_bc_sats 367048 non-null int64 46 num_bc_tl 367048 non-null int64 47 num_il_tl 367048 non-null int64 48 num_op_rev_tl 367048 non-null int64 49 num_rev_accts 367048 non-null float64 50 num_rev_tl_bal_gt_0 367048 non-null int64 51 num_sats 367048 non-null int64 52 num_tl_120dpd_2m 367048 non-null float64 53 num_tl_30dpd 367048 non-null int64 54 num_tl_90g_dpd_24m 367048 non-null int64 55 num_tl_op_past_12m 367048 non-null int64 56 pct_tl_nvr_dlq 367048 non-null float64 57 percent_bc_gt_75 367048 non-null float64 58 pub_rec_bankruptcies 367048 non-null int64 59 tax_liens 367048 non-null int64 60 tot_hi_cred_lim 367048 non-null int64 61 total_bal_ex_mort 367048 non-null int64 62 total_bc_limit 367048 non-null int64 63 total_il_high_credit_limit 367048 non-null int64 64 fico_avg 367048 non-null float64 dtypes: float64(18), int64(38), object(9) memory usage: 184.8+ MB
Ahora contamos con 365956 observaciones.
Para poder trabajar de forma más simplificada con las variables numéricas, se agrupan de 10 en 10 para poder trabajar con ellos conjuntamente.
loans_numeric = loans_eda_ind.select_dtypes(include = ['int64', 'float64'])
loans_numeric.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 367048 entries, 0 to 460540 Data columns (total 56 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 term 367048 non-null float64 1 installment 367048 non-null float64 2 emp_length 367048 non-null float64 3 annual_inc 367048 non-null float64 4 loan_status 367048 non-null float64 5 dti 367048 non-null float64 6 delinq_2yrs 367048 non-null int64 7 inq_last_6mths 367048 non-null int64 8 mths_since_last_delinq 367048 non-null int64 9 open_acc 367048 non-null int64 10 pub_rec 367048 non-null int64 11 revol_bal 367048 non-null int64 12 revol_util 367048 non-null float64 13 total_acc 367048 non-null int64 14 collections_12_mths_ex_med 367048 non-null int64 15 policy_code 367048 non-null int64 16 acc_now_delinq 367048 non-null int64 17 tot_coll_amt 367048 non-null int64 18 tot_cur_bal 367048 non-null int64 19 total_rev_hi_lim 367048 non-null int64 20 acc_open_past_24mths 367048 non-null int64 21 avg_cur_bal 367048 non-null float64 22 bc_open_to_buy 367048 non-null float64 23 bc_util 367048 non-null float64 24 chargeoff_within_12_mths 367048 non-null int64 25 delinq_amnt 367048 non-null int64 26 mo_sin_old_il_acct 367048 non-null float64 27 mo_sin_old_rev_tl_op 367048 non-null int64 28 mo_sin_rcnt_rev_tl_op 367048 non-null int64 29 mo_sin_rcnt_tl 367048 non-null int64 30 mort_acc 367048 non-null int64 31 mths_since_recent_bc 367048 non-null float64 32 mths_since_recent_inq 367048 non-null float64 33 num_accts_ever_120_pd 367048 non-null int64 34 num_actv_bc_tl 367048 non-null int64 35 num_actv_rev_tl 367048 non-null int64 36 num_bc_sats 367048 non-null int64 37 num_bc_tl 367048 non-null int64 38 num_il_tl 367048 non-null int64 39 num_op_rev_tl 367048 non-null int64 40 num_rev_accts 367048 non-null float64 41 num_rev_tl_bal_gt_0 367048 non-null int64 42 num_sats 367048 non-null int64 43 num_tl_120dpd_2m 367048 non-null float64 44 num_tl_30dpd 367048 non-null int64 45 num_tl_90g_dpd_24m 367048 non-null int64 46 num_tl_op_past_12m 367048 non-null int64 47 pct_tl_nvr_dlq 367048 non-null float64 48 percent_bc_gt_75 367048 non-null float64 49 pub_rec_bankruptcies 367048 non-null int64 50 tax_liens 367048 non-null int64 51 tot_hi_cred_lim 367048 non-null int64 52 total_bal_ex_mort 367048 non-null int64 53 total_bc_limit 367048 non-null int64 54 total_il_high_credit_limit 367048 non-null int64 55 fico_avg 367048 non-null float64 dtypes: float64(18), int64(38) memory usage: 159.6 MB
#Separacion en grupos
loans_ind_1 = loans_numeric.iloc[:, 0:10]
loans_ind_2 = loans_numeric.iloc[:, 10:20]
loans_ind_3 = loans_numeric.iloc[:, 20:30]
loans_ind_4 = loans_numeric.iloc[:, 30:40]
loans_ind_5 = loans_numeric.iloc[:, 40:50]
loans_ind_6 = loans_numeric.iloc[:, 50:56]
Mediante los histogramas de cada variable en primer lugar nos fijaremos en las variables que tienen un unico resultado. Estas variables se eliminarán ya que no otorgan información al modelo.
sns.pairplot(loans_ind_1)
<seaborn.axisgrid.PairGrid at 0x168daeae648>
sns.pairplot(loans_ind_2)
<seaborn.axisgrid.PairGrid at 0x16883ff6308>
sns.pairplot(loans_ind_3)
<seaborn.axisgrid.PairGrid at 0x168a59bef88>
sns.pairplot(loans_ind_4)
<seaborn.axisgrid.PairGrid at 0x168f0d96d48>
sns.pairplot(loans_ind_5)
<seaborn.axisgrid.PairGrid at 0x169603baf48>
sns.pairplot(loans_ind_6)
<seaborn.axisgrid.PairGrid at 0x1698054d808>
Representamos las variables que se observa que pueda que no den información relevante.
sns.histplot(loans_numeric.term)
<AxesSubplot:xlabel='term', ylabel='Count'>
sns.histplot(loans_numeric.annual_inc)
<AxesSubplot:xlabel='annual_inc', ylabel='Count'>
sns.histplot(loans_numeric.delinq_2yrs)
<AxesSubplot:xlabel='delinq_2yrs', ylabel='Count'>
sns.histplot(loans_numeric.pub_rec)
<AxesSubplot:xlabel='pub_rec', ylabel='Count'>
sns.histplot(loans_numeric.collections_12_mths_ex_med)
<AxesSubplot:xlabel='collections_12_mths_ex_med', ylabel='Count'>
sns.histplot(loans_numeric.policy_code)
<AxesSubplot:xlabel='policy_code', ylabel='Count'>
sns.histplot(loans_numeric.acc_now_delinq)
<AxesSubplot:xlabel='acc_now_delinq', ylabel='Count'>
sns.histplot(loans_numeric.tot_coll_amt)
<AxesSubplot:xlabel='tot_coll_amt', ylabel='Count'>
sns.histplot(loans_numeric.total_rev_hi_lim)
<AxesSubplot:xlabel='total_rev_hi_lim', ylabel='Count'>
sns.histplot(loans_numeric.chargeoff_within_12_mths)
<AxesSubplot:xlabel='chargeoff_within_12_mths', ylabel='Count'>
sns.histplot(loans_numeric.delinq_amnt, bins = 100)
<AxesSubplot:xlabel='delinq_amnt', ylabel='Count'>
sns.histplot(loans_numeric.mort_acc)
<AxesSubplot:xlabel='mort_acc', ylabel='Count'>
sns.histplot(loans_numeric.num_accts_ever_120_pd)
<AxesSubplot:xlabel='num_accts_ever_120_pd', ylabel='Count'>
sns.histplot(loans_numeric.num_tl_120dpd_2m)
<AxesSubplot:xlabel='num_tl_120dpd_2m', ylabel='Count'>
sns.histplot(loans_numeric.num_tl_30dpd)
<AxesSubplot:xlabel='num_tl_30dpd', ylabel='Count'>
sns.histplot(loans_numeric.num_tl_90g_dpd_24m)
<AxesSubplot:xlabel='num_tl_90g_dpd_24m', ylabel='Count'>
sns.histplot(loans_numeric.tax_liens)
<AxesSubplot:xlabel='tax_liens', ylabel='Count'>
descarte_valores_unicos = ['delinq_2yrs', 'pub_rec', 'collections_12_mths_ex_med', 'policy_code', 'acc_now_delinq', 'tot_coll_amt', 'chargeoff_within_12_mths', 'delinq_amnt', 'num_accts_ever_120_pd', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'tax_liens']
loans_eda_ind = loans_eda_ind.drop(descarte_valores_unicos, axis = 1)
sns.histplot(loans_eda_ind.num_rev_tl_bal_gt_0)
<AxesSubplot:xlabel='num_rev_tl_bal_gt_0', ylabel='Count'>
loans_eda_ind.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 367048 entries, 0 to 460540 Data columns (total 52 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 term 367048 non-null float64 1 installment 367048 non-null float64 2 grade 367048 non-null object 3 emp_length 367048 non-null float64 4 home_ownership 367048 non-null object 5 annual_inc 367048 non-null float64 6 verification_status 367048 non-null object 7 loan_status 367048 non-null float64 8 purpose 367048 non-null object 9 title 367048 non-null object 10 addr_state 367048 non-null object 11 dti 367048 non-null float64 12 earliest_cr_line 367048 non-null object 13 inq_last_6mths 367048 non-null int64 14 mths_since_last_delinq 367048 non-null int64 15 open_acc 367048 non-null int64 16 revol_bal 367048 non-null int64 17 revol_util 367048 non-null float64 18 total_acc 367048 non-null int64 19 initial_list_status 367048 non-null object 20 application_type 367048 non-null object 21 tot_cur_bal 367048 non-null int64 22 total_rev_hi_lim 367048 non-null int64 23 acc_open_past_24mths 367048 non-null int64 24 avg_cur_bal 367048 non-null float64 25 bc_open_to_buy 367048 non-null float64 26 bc_util 367048 non-null float64 27 mo_sin_old_il_acct 367048 non-null float64 28 mo_sin_old_rev_tl_op 367048 non-null int64 29 mo_sin_rcnt_rev_tl_op 367048 non-null int64 30 mo_sin_rcnt_tl 367048 non-null int64 31 mort_acc 367048 non-null int64 32 mths_since_recent_bc 367048 non-null float64 33 mths_since_recent_inq 367048 non-null float64 34 num_actv_bc_tl 367048 non-null int64 35 num_actv_rev_tl 367048 non-null int64 36 num_bc_sats 367048 non-null int64 37 num_bc_tl 367048 non-null int64 38 num_il_tl 367048 non-null int64 39 num_op_rev_tl 367048 non-null int64 40 num_rev_accts 367048 non-null float64 41 num_rev_tl_bal_gt_0 367048 non-null int64 42 num_sats 367048 non-null int64 43 num_tl_op_past_12m 367048 non-null int64 44 pct_tl_nvr_dlq 367048 non-null float64 45 percent_bc_gt_75 367048 non-null float64 46 pub_rec_bankruptcies 367048 non-null int64 47 tot_hi_cred_lim 367048 non-null int64 48 total_bal_ex_mort 367048 non-null int64 49 total_bc_limit 367048 non-null int64 50 total_il_high_credit_limit 367048 non-null int64 51 fico_avg 367048 non-null float64 dtypes: float64(17), int64(26), object(9) memory usage: 148.4+ MB
loans_numeric = loans_eda_ind.select_dtypes(include = ['int64', 'float64'])
loans_numeric.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 367048 entries, 0 to 460540 Data columns (total 43 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 term 367048 non-null float64 1 installment 367048 non-null float64 2 emp_length 367048 non-null float64 3 annual_inc 367048 non-null float64 4 loan_status 367048 non-null float64 5 dti 367048 non-null float64 6 inq_last_6mths 367048 non-null int64 7 mths_since_last_delinq 367048 non-null int64 8 open_acc 367048 non-null int64 9 revol_bal 367048 non-null int64 10 revol_util 367048 non-null float64 11 total_acc 367048 non-null int64 12 tot_cur_bal 367048 non-null int64 13 total_rev_hi_lim 367048 non-null int64 14 acc_open_past_24mths 367048 non-null int64 15 avg_cur_bal 367048 non-null float64 16 bc_open_to_buy 367048 non-null float64 17 bc_util 367048 non-null float64 18 mo_sin_old_il_acct 367048 non-null float64 19 mo_sin_old_rev_tl_op 367048 non-null int64 20 mo_sin_rcnt_rev_tl_op 367048 non-null int64 21 mo_sin_rcnt_tl 367048 non-null int64 22 mort_acc 367048 non-null int64 23 mths_since_recent_bc 367048 non-null float64 24 mths_since_recent_inq 367048 non-null float64 25 num_actv_bc_tl 367048 non-null int64 26 num_actv_rev_tl 367048 non-null int64 27 num_bc_sats 367048 non-null int64 28 num_bc_tl 367048 non-null int64 29 num_il_tl 367048 non-null int64 30 num_op_rev_tl 367048 non-null int64 31 num_rev_accts 367048 non-null float64 32 num_rev_tl_bal_gt_0 367048 non-null int64 33 num_sats 367048 non-null int64 34 num_tl_op_past_12m 367048 non-null int64 35 pct_tl_nvr_dlq 367048 non-null float64 36 percent_bc_gt_75 367048 non-null float64 37 pub_rec_bankruptcies 367048 non-null int64 38 tot_hi_cred_lim 367048 non-null int64 39 total_bal_ex_mort 367048 non-null int64 40 total_bc_limit 367048 non-null int64 41 total_il_high_credit_limit 367048 non-null int64 42 fico_avg 367048 non-null float64 dtypes: float64(17), int64(26) memory usage: 123.2 MB
corr = loans_numeric.corr()
mask = np.triu(np.ones_like(corr, dtype = bool))
cmap = sns.diverging_palette(222, 222, as_cmap = True)
f, ax = plt.subplots(figsize = (11, 9))
sns.heatmap(corr, mask = mask, cmap = cmap, center = 0, square = True, linewidths = .5)
<AxesSubplot:>
loans_numeric1 = loans_numeric.iloc[:, 25:43]
corr1 = loans_numeric1.corr()
mask = np.triu(np.ones_like(corr1, dtype = bool))
cmap = sns.diverging_palette(222, 222, as_cmap = True)
f, ax = plt.subplots(figsize = (11, 9))
sns.heatmap(corr1, mask = mask, cmap = cmap, center = 0, square = True, linewidths = .5)
<AxesSubplot:>
Observando la matriz de correlaciones y el diccionario de las variables, se observa una alta correlación entre num_actv_bc_tl, num_bc_tl y num_bc_sats, además informan practicamente de lo mismo, por lo que es redundante mantenerlas todas, eliminaremos num_bc_tl y num_bc_sats.
Las variables num_actv_rev_tl y num_rev_tl_bal_gt_0 tienen una muy alta correlación entre ellas, esto es ya que ambas explican lo mismo (cuentas revolving activas), por ello se procede a eliminar la segunda variable.
Entre las variables num_op_rev_tl y num_rev_accts tienen alta correlación, además la variable num_rev_accts (número de cuentas revolving) contiene información de num_op_rev_tl (número de cuentas revolving activas), por lo que se decide mantener la primera ya que contiene mayor información.
Por último, entre num_bc_sats y num_bc_tl existe alta correlación e interpretando ambas variables se decide mantener num_bc_tl al contener mayor información.
descarte_cor = ['num_bc_tl', 'num_bc_sats', 'num_rev_tl_bal_gt_0', 'num_op_rev_tl', 'num_bc_sats']
loans_eda_ind = loans_eda_ind.drop(descarte_cor, axis = 1)
loans_eda_ind.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 367048 entries, 0 to 460540 Data columns (total 48 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 term 367048 non-null float64 1 installment 367048 non-null float64 2 grade 367048 non-null object 3 emp_length 367048 non-null float64 4 home_ownership 367048 non-null object 5 annual_inc 367048 non-null float64 6 verification_status 367048 non-null object 7 loan_status 367048 non-null float64 8 purpose 367048 non-null object 9 title 367048 non-null object 10 addr_state 367048 non-null object 11 dti 367048 non-null float64 12 earliest_cr_line 367048 non-null object 13 inq_last_6mths 367048 non-null int64 14 mths_since_last_delinq 367048 non-null int64 15 open_acc 367048 non-null int64 16 revol_bal 367048 non-null int64 17 revol_util 367048 non-null float64 18 total_acc 367048 non-null int64 19 initial_list_status 367048 non-null object 20 application_type 367048 non-null object 21 tot_cur_bal 367048 non-null int64 22 total_rev_hi_lim 367048 non-null int64 23 acc_open_past_24mths 367048 non-null int64 24 avg_cur_bal 367048 non-null float64 25 bc_open_to_buy 367048 non-null float64 26 bc_util 367048 non-null float64 27 mo_sin_old_il_acct 367048 non-null float64 28 mo_sin_old_rev_tl_op 367048 non-null int64 29 mo_sin_rcnt_rev_tl_op 367048 non-null int64 30 mo_sin_rcnt_tl 367048 non-null int64 31 mort_acc 367048 non-null int64 32 mths_since_recent_bc 367048 non-null float64 33 mths_since_recent_inq 367048 non-null float64 34 num_actv_bc_tl 367048 non-null int64 35 num_actv_rev_tl 367048 non-null int64 36 num_il_tl 367048 non-null int64 37 num_rev_accts 367048 non-null float64 38 num_sats 367048 non-null int64 39 num_tl_op_past_12m 367048 non-null int64 40 pct_tl_nvr_dlq 367048 non-null float64 41 percent_bc_gt_75 367048 non-null float64 42 pub_rec_bankruptcies 367048 non-null int64 43 tot_hi_cred_lim 367048 non-null int64 44 total_bal_ex_mort 367048 non-null int64 45 total_bc_limit 367048 non-null int64 46 total_il_high_credit_limit 367048 non-null int64 47 fico_avg 367048 non-null float64 dtypes: float64(17), int64(22), object(9) memory usage: 137.2+ MB
Para guardar el dataset con las variables óptimas para el modelo, se realizará un drop sobre el dataset inicial con todas las variables anteriormente descartadas. Esto es debido a que el dataset loans_eda_ind no cuenta con valores Na's.
loans_ind = loans_ind.drop(descarte_cor, axis = 1)
loans_ind = loans_ind.drop(descarte_valores_unicos, axis = 1)
loans_ind.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 460541 entries, 0 to 460540 Data columns (total 48 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 term 460541 non-null float64 1 installment 460541 non-null float64 2 grade 460541 non-null object 3 emp_length 435708 non-null float64 4 home_ownership 460541 non-null object 5 annual_inc 460541 non-null float64 6 verification_status 460541 non-null object 7 loan_status 460541 non-null float64 8 purpose 460541 non-null object 9 title 460484 non-null object 10 addr_state 460541 non-null object 11 dti 460541 non-null float64 12 earliest_cr_line 460541 non-null object 13 inq_last_6mths 460541 non-null int64 14 mths_since_last_delinq 460541 non-null int64 15 open_acc 460541 non-null int64 16 revol_bal 460541 non-null int64 17 revol_util 460309 non-null float64 18 total_acc 460541 non-null int64 19 initial_list_status 460541 non-null object 20 application_type 460541 non-null object 21 tot_cur_bal 460541 non-null int64 22 total_rev_hi_lim 460541 non-null int64 23 acc_open_past_24mths 460541 non-null int64 24 avg_cur_bal 460535 non-null float64 25 bc_open_to_buy 455698 non-null float64 26 bc_util 455373 non-null float64 27 mo_sin_old_il_acct 447350 non-null float64 28 mo_sin_old_rev_tl_op 460541 non-null int64 29 mo_sin_rcnt_rev_tl_op 460541 non-null int64 30 mo_sin_rcnt_tl 460541 non-null int64 31 mort_acc 460541 non-null int64 32 mths_since_recent_bc 456002 non-null float64 33 mths_since_recent_inq 420213 non-null float64 34 num_actv_bc_tl 460541 non-null int64 35 num_actv_rev_tl 460541 non-null int64 36 num_il_tl 460541 non-null int64 37 num_rev_accts 460540 non-null float64 38 num_sats 460541 non-null int64 39 num_tl_op_past_12m 460541 non-null int64 40 pct_tl_nvr_dlq 460541 non-null float64 41 percent_bc_gt_75 455428 non-null float64 42 pub_rec_bankruptcies 460541 non-null int64 43 tot_hi_cred_lim 460541 non-null int64 44 total_bal_ex_mort 460541 non-null int64 45 total_bc_limit 460541 non-null int64 46 total_il_high_credit_limit 460541 non-null int64 47 fico_avg 460541 non-null float64 dtypes: float64(17), int64(22), object(9) memory usage: 168.7+ MB
loans_ind.to_csv('../data/loans_ind_clean.csv', index = False)
Observando la escasa variabilidad de la mayoria de los valores, consideramos que la variable no se debe tener en cuenta
loans_eda_ind.num_actv_bc_tl.describe()
count 367048.000000 mean 3.685466 std 2.170491 min 0.000000 25% 2.000000 50% 3.000000 75% 5.000000 max 26.000000 Name: num_actv_bc_tl, dtype: float64
loans_eda_ind.num_actv_rev_tl.describe()
count 367048.000000 mean 5.756544 std 3.202521 min 0.000000 25% 4.000000 50% 5.000000 75% 7.000000 max 42.000000 Name: num_actv_rev_tl, dtype: float64
fig, ax = plt.subplots(nrows = 1, figsize = (25, 5))
ax = sns.lineplot(x = 'annual_inc',
y = 'num_actv_bc_tl',
data = loans_eda_ind,
label = '%')
ax = sns.lineplot(x = 'annual_inc',
y = 'num_actv_rev_tl',
data = loans_eda_ind,
label = 'ee')
sns.displot(loans_eda_ind,
x = loans_eda_ind['bc_util'])
<seaborn.axisgrid.FacetGrid at 0x2aaa49e8be0>
Carga de base de datos joint
pathfile5 = '../data/loans_joint.csv'
loans_joint = pd.read_csv(pathfile5 , engine = 'python')
loans_joint
| term | installment | grade | emp_length | home_ownership | annual_inc | verification_status | loan_status | desc | purpose | ... | sec_app_inq_last_6mths | sec_app_mort_acc | sec_app_open_acc | sec_app_revol_util | sec_app_open_act_il | sec_app_num_rev_accts | sec_app_chargeoff_within_12_mths | sec_app_collections_12_mths_ex_med | sec_app_mths_since_last_major_derog | fico_avg | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 36 months | 486.98 | B | 3 years | MORTGAGE | 115000.0 | Not Verified | 1.0 | NaN | car | ... | 0.0 | 2.0 | 8.0 | 7.7 | 0.0 | 27.0 | 0.0 | 0.0 | 57.0 | 682.0 |
| 1 | 36 months | 112.83 | B | 3 years | MORTGAGE | 40000.0 | Verified | 1.0 | NaN | debt_consolidation | ... | 0.0 | 2.0 | 18.0 | 50.0 | 3.0 | 20.0 | 0.0 | 0.0 | 78.0 | 672.0 |
| 2 | 36 months | 285.70 | E | < 1 year | RENT | 24000.0 | Not Verified | 0.0 | NaN | other | ... | 0.0 | 0.0 | 23.0 | NaN | 23.0 | 2.0 | 1.0 | 3.0 | 5.0 | 662.0 |
| 3 | 60 months | 437.92 | C | 5 years | MORTGAGE | 55000.0 | Source Verified | 1.0 | NaN | home_improvement | ... | 2.0 | 3.0 | 23.0 | 42.1 | 13.0 | 13.0 | 0.0 | 2.0 | 29.0 | 697.0 |
| 4 | 36 months | 817.41 | B | NaN | MORTGAGE | 32000.0 | Verified | 0.0 | NaN | credit_card | ... | 0.0 | 2.0 | 14.0 | 76.1 | 2.0 | 24.0 | 0.0 | 0.0 | 50.0 | 692.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1645 | 60 months | 390.68 | E | 2 years | RENT | 26000.0 | Verified | 0.0 | NaN | debt_consolidation | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 682.0 |
| 1646 | 60 months | 482.23 | D | 9 years | MORTGAGE | 65000.0 | Verified | 1.0 | NaN | debt_consolidation | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 697.0 |
| 1647 | 60 months | 1006.68 | F | 10+ years | RENT | 61140.0 | Verified | 1.0 | NaN | debt_consolidation | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 667.0 |
| 1648 | 60 months | 369.99 | E | 6 years | MORTGAGE | 120000.0 | Source Verified | 0.0 | NaN | debt_consolidation | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 702.0 |
| 1649 | 36 months | 395.89 | B | 6 years | MORTGAGE | 48000.0 | Verified | 1.0 | NaN | debt_consolidation | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 707.0 |
1650 rows × 100 columns
((loans_joint.iloc[:, 0:40].isnull().sum() / len(loans_joint))*100).sort_values(ascending = False)
desc 100.000000 mths_since_last_record 80.060606 mths_since_last_major_derog 73.878788 mths_since_last_delinq 46.787879 emp_length 15.333333 mths_since_rcnt_il 12.000000 total_bal_il 10.121212 open_il_24m 10.121212 open_il_12m 10.121212 open_acc_6m 10.121212 open_act_il 10.121212 dti 1.272727 title 0.363636 revol_util 0.121212 grade 0.000000 addr_state 0.000000 delinq_2yrs 0.000000 purpose 0.000000 installment 0.000000 earliest_cr_line 0.000000 verification_status 0.000000 annual_inc 0.000000 home_ownership 0.000000 loan_status 0.000000 pub_rec 0.000000 inq_last_6mths 0.000000 open_acc 0.000000 revol_bal 0.000000 total_acc 0.000000 initial_list_status 0.000000 collections_12_mths_ex_med 0.000000 policy_code 0.000000 application_type 0.000000 annual_inc_joint 0.000000 dti_joint 0.000000 verification_status_joint 0.000000 acc_now_delinq 0.000000 tot_coll_amt 0.000000 tot_cur_bal 0.000000 term 0.000000 dtype: float64
((loans_joint.iloc[:, 40:80].isnull().sum() / len(loans_joint))*100).sort_values(ascending = False)
mths_since_recent_bc_dlq 76.060606 mths_since_recent_revol_delinq 64.727273 il_util 24.242424 all_util 10.181818 open_rv_12m 10.121212 open_rv_24m 10.121212 max_bal_bc 10.121212 inq_fi 10.121212 total_cu_tl 10.121212 inq_last_12m 10.121212 mths_since_recent_inq 8.727273 bc_open_to_buy 2.727273 bc_util 2.727273 mths_since_recent_bc 2.727273 percent_bc_gt_75 2.727273 mo_sin_old_il_acct 2.000000 num_accts_ever_120_pd 0.000000 num_op_rev_tl 0.000000 num_tl_op_past_12m 0.000000 num_tl_90g_dpd_24m 0.000000 num_tl_30dpd 0.000000 num_tl_120dpd_2m 0.000000 total_rev_hi_lim 0.000000 num_sats 0.000000 num_rev_tl_bal_gt_0 0.000000 num_rev_accts 0.000000 acc_open_past_24mths 0.000000 avg_cur_bal 0.000000 num_il_tl 0.000000 num_actv_bc_tl 0.000000 chargeoff_within_12_mths 0.000000 delinq_amnt 0.000000 mo_sin_old_rev_tl_op 0.000000 mo_sin_rcnt_rev_tl_op 0.000000 mo_sin_rcnt_tl 0.000000 pct_tl_nvr_dlq 0.000000 num_bc_tl 0.000000 num_bc_sats 0.000000 num_actv_rev_tl 0.000000 mort_acc 0.000000 dtype: float64
((loans_joint.iloc[:, 80:100].isnull().sum() / len(loans_joint))*100).sort_values(ascending = False)
sec_app_mths_since_last_major_derog 65.575758 sec_app_revol_util 16.969697 sec_app_earliest_cr_line 14.909091 sec_app_collections_12_mths_ex_med 14.909091 sec_app_chargeoff_within_12_mths 14.909091 sec_app_num_rev_accts 14.909091 sec_app_open_act_il 14.909091 sec_app_open_acc 14.909091 sec_app_mort_acc 14.909091 sec_app_inq_last_6mths 14.909091 sec_app_fico_range_high 14.909091 sec_app_fico_range_low 14.909091 revol_bal_joint 14.909091 total_bal_ex_mort 0.000000 tax_liens 0.000000 tot_hi_cred_lim 0.000000 fico_avg 0.000000 total_bc_limit 0.000000 total_il_high_credit_limit 0.000000 pub_rec_bankruptcies 0.000000 dtype: float64
Eliminamos las variables que poseen más de un 50% de valores na.
loans_descartes_joint_na = loans_joint.loc[:,['sec_app_mths_since_last_major_derog', 'mths_since_recent_bc_dlq', 'mths_since_recent_revol_delinq', 'desc', 'mths_since_last_record', 'mths_since_last_record', 'mths_since_last_major_derog']]
loans_joint = loans_joint.drop(loans_descartes_joint_na, axis = 1)
loans_joint
| term | installment | grade | emp_length | home_ownership | annual_inc | verification_status | loan_status | purpose | title | ... | sec_app_earliest_cr_line | sec_app_inq_last_6mths | sec_app_mort_acc | sec_app_open_acc | sec_app_revol_util | sec_app_open_act_il | sec_app_num_rev_accts | sec_app_chargeoff_within_12_mths | sec_app_collections_12_mths_ex_med | fico_avg | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 36 months | 486.98 | B | 3 years | MORTGAGE | 115000.0 | Not Verified | 1.0 | car | Car financing | ... | Jul-1995 | 0.0 | 2.0 | 8.0 | 7.7 | 0.0 | 27.0 | 0.0 | 0.0 | 682.0 |
| 1 | 36 months | 112.83 | B | 3 years | MORTGAGE | 40000.0 | Verified | 1.0 | debt_consolidation | Debt consolidation | ... | Jan-2007 | 0.0 | 2.0 | 18.0 | 50.0 | 3.0 | 20.0 | 0.0 | 0.0 | 672.0 |
| 2 | 36 months | 285.70 | E | < 1 year | RENT | 24000.0 | Not Verified | 0.0 | other | Other | ... | Sep-2005 | 0.0 | 0.0 | 23.0 | NaN | 23.0 | 2.0 | 1.0 | 3.0 | 662.0 |
| 3 | 60 months | 437.92 | C | 5 years | MORTGAGE | 55000.0 | Source Verified | 1.0 | home_improvement | Home improvement | ... | Jun-2006 | 2.0 | 3.0 | 23.0 | 42.1 | 13.0 | 13.0 | 0.0 | 2.0 | 697.0 |
| 4 | 36 months | 817.41 | B | NaN | MORTGAGE | 32000.0 | Verified | 0.0 | credit_card | Credit card refinancing | ... | Oct-1997 | 0.0 | 2.0 | 14.0 | 76.1 | 2.0 | 24.0 | 0.0 | 0.0 | 692.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1645 | 60 months | 390.68 | E | 2 years | RENT | 26000.0 | Verified | 0.0 | debt_consolidation | Debt consolidation | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 682.0 |
| 1646 | 60 months | 482.23 | D | 9 years | MORTGAGE | 65000.0 | Verified | 1.0 | debt_consolidation | Debt consolidation | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 697.0 |
| 1647 | 60 months | 1006.68 | F | 10+ years | RENT | 61140.0 | Verified | 1.0 | debt_consolidation | Debt consolidation | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 667.0 |
| 1648 | 60 months | 369.99 | E | 6 years | MORTGAGE | 120000.0 | Source Verified | 0.0 | debt_consolidation | Debt consolidation | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 702.0 |
| 1649 | 36 months | 395.89 | B | 6 years | MORTGAGE | 48000.0 | Verified | 1.0 | debt_consolidation | Debt consolidation | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 707.0 |
1650 rows × 94 columns
Buscamos las variables categoricas dentro de las variables que nos quedan joint
Pasamos categóricas a numéricas
categorical_features = loans_joint.select_dtypes(include = ['object'])
categorical_features
| term | grade | emp_length | home_ownership | verification_status | purpose | title | addr_state | earliest_cr_line | revol_util | initial_list_status | application_type | verification_status_joint | sec_app_earliest_cr_line | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 36 months | B | 3 years | MORTGAGE | Not Verified | car | Car financing | PA | Jul-1995 | 8.1% | w | Joint App | Not Verified | Jul-1995 |
| 1 | 36 months | B | 3 years | MORTGAGE | Verified | debt_consolidation | Debt consolidation | FL | Aug-2003 | 46.8% | w | Joint App | Verified | Jan-2007 |
| 2 | 36 months | E | < 1 year | RENT | Not Verified | other | Other | VA | Oct-2007 | 52.9% | w | Joint App | Not Verified | Sep-2005 |
| 3 | 60 months | C | 5 years | MORTGAGE | Source Verified | home_improvement | Home improvement | PA | Dec-2003 | 45.7% | w | Joint App | Source Verified | Jun-2006 |
| 4 | 36 months | B | NaN | MORTGAGE | Verified | credit_card | Credit card refinancing | AZ | Jan-2004 | 74.3% | f | Joint App | Verified | Oct-1997 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1645 | 60 months | E | 2 years | RENT | Verified | debt_consolidation | Debt consolidation | MO | Aug-2007 | 83.5% | w | Joint App | Not Verified | NaN |
| 1646 | 60 months | D | 9 years | MORTGAGE | Verified | debt_consolidation | Debt consolidation | IL | Sep-2004 | 51.7% | f | Joint App | Not Verified | NaN |
| 1647 | 60 months | F | 10+ years | RENT | Verified | debt_consolidation | Debt consolidation | IL | Apr-2005 | 68.2% | w | Joint App | Not Verified | NaN |
| 1648 | 60 months | E | 6 years | MORTGAGE | Source Verified | debt_consolidation | Debt consolidation | FL | Apr-2005 | 59.6% | f | Joint App | Not Verified | NaN |
| 1649 | 36 months | B | 6 years | MORTGAGE | Verified | debt_consolidation | Debt consolidation | FL | Jan-2006 | 24.4% | f | Joint App | Not Verified | NaN |
1650 rows × 14 columns
split_term = map(lambda x: x.split(' ')[1], loans_joint.term)
loans_joint.term = list(split_term)
loans_joint['term'] = loans_joint['term'].astype(float)
loans_joint['revol_util'] = loans_joint.revol_util.str.strip('%').astype(float)
loans_joint['earliest_cr_line'] = pd.to_datetime(loans_joint['earliest_cr_line'])
loans_joint['emp_length'] = loans_joint.emp_length.str.strip('<')
loans_joint['emp_length'] = loans_joint.emp_length.str.strip('+ years')
loans_joint['emp_length'] = loans_joint['emp_length'].astype('float')
loans_joint['emp_length']
0 3.0
1 3.0
2 1.0
3 5.0
4 NaN
...
1645 2.0
1646 9.0
1647 10.0
1648 6.0
1649 6.0
Name: emp_length, Length: 1650, dtype: float64
Para la variable mths_since_last_delinq, que indica los meses que han pasado desde la última morisidad del solicitante, se observa un 48 % de valores nulos, que se consideran asociados a que ese solicitante no ha tenido ninguna incidencia de pagos en su historial crediticio. A raiz de esta suposición, se considera licita la aplicación de un baremo que transforme la variable a categórica. Esta variable estará situada entre 0 y 10, donde 0 significa que no ha tenido ninguna incidencia, 1 que los meses transcurridos desde su útlima incidencia es superior a X meses, 2 entre X ...
loans_joint['mths_since_last_delinq'].describe()
count 878.000000 mean 32.534169 std 21.498600 min 0.000000 25% 14.000000 50% 29.000000 75% 47.000000 max 107.000000 Name: mths_since_last_delinq, dtype: float64
sns.boxplot(y = 'mths_since_last_delinq', data = loans_joint)
<AxesSubplot:ylabel='mths_since_last_delinq'>
def delinq_range(x):
if x <= 12:
return 10
elif x <= 24:
return 9
elif x <= 36:
return 8
elif x <= 48:
return 7
elif x <= 60:
return 6
elif x <= 72:
return 5
elif x <= 84:
return 4
elif x <= 96:
return 3
elif x <= 108:
return 2
elif x > 108:
return 1
else:
return 0
loans_joint['mths_since_last_delinq'] = loans_joint.mths_since_last_delinq.map(delinq_range)
loans_joint['mths_since_last_delinq']
0 6
1 0
2 10
3 0
4 9
..
1645 0
1646 0
1647 0
1648 9
1649 0
Name: mths_since_last_delinq, Length: 1650, dtype: int64
loans_joint['mths_since_last_delinq'].isnull().sum()
0
loans_numeric_joint = loans_joint.select_dtypes(include = ['int64', 'float64'])
loans_numeric_joint.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1650 entries, 0 to 1649 Data columns (total 83 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 term 1650 non-null float64 1 installment 1650 non-null float64 2 emp_length 1397 non-null float64 3 annual_inc 1650 non-null float64 4 loan_status 1650 non-null float64 5 dti 1629 non-null float64 6 delinq_2yrs 1650 non-null int64 7 inq_last_6mths 1650 non-null int64 8 mths_since_last_delinq 1650 non-null int64 9 open_acc 1650 non-null int64 10 pub_rec 1650 non-null int64 11 revol_bal 1650 non-null int64 12 revol_util 1648 non-null float64 13 total_acc 1650 non-null int64 14 collections_12_mths_ex_med 1650 non-null int64 15 policy_code 1650 non-null int64 16 annual_inc_joint 1650 non-null float64 17 dti_joint 1650 non-null float64 18 acc_now_delinq 1650 non-null int64 19 tot_coll_amt 1650 non-null int64 20 tot_cur_bal 1650 non-null int64 21 open_acc_6m 1483 non-null float64 22 open_act_il 1483 non-null float64 23 open_il_12m 1483 non-null float64 24 open_il_24m 1483 non-null float64 25 mths_since_rcnt_il 1452 non-null float64 26 total_bal_il 1483 non-null float64 27 il_util 1250 non-null float64 28 open_rv_12m 1483 non-null float64 29 open_rv_24m 1483 non-null float64 30 max_bal_bc 1483 non-null float64 31 all_util 1482 non-null float64 32 total_rev_hi_lim 1650 non-null int64 33 inq_fi 1483 non-null float64 34 total_cu_tl 1483 non-null float64 35 inq_last_12m 1483 non-null float64 36 acc_open_past_24mths 1650 non-null int64 37 avg_cur_bal 1650 non-null float64 38 bc_open_to_buy 1605 non-null float64 39 bc_util 1605 non-null float64 40 chargeoff_within_12_mths 1650 non-null int64 41 delinq_amnt 1650 non-null int64 42 mo_sin_old_il_acct 1617 non-null float64 43 mo_sin_old_rev_tl_op 1650 non-null int64 44 mo_sin_rcnt_rev_tl_op 1650 non-null int64 45 mo_sin_rcnt_tl 1650 non-null int64 46 mort_acc 1650 non-null int64 47 mths_since_recent_bc 1605 non-null float64 48 mths_since_recent_inq 1506 non-null float64 49 num_accts_ever_120_pd 1650 non-null int64 50 num_actv_bc_tl 1650 non-null int64 51 num_actv_rev_tl 1650 non-null int64 52 num_bc_sats 1650 non-null int64 53 num_bc_tl 1650 non-null int64 54 num_il_tl 1650 non-null int64 55 num_op_rev_tl 1650 non-null int64 56 num_rev_accts 1650 non-null float64 57 num_rev_tl_bal_gt_0 1650 non-null int64 58 num_sats 1650 non-null int64 59 num_tl_120dpd_2m 1650 non-null float64 60 num_tl_30dpd 1650 non-null int64 61 num_tl_90g_dpd_24m 1650 non-null int64 62 num_tl_op_past_12m 1650 non-null int64 63 pct_tl_nvr_dlq 1650 non-null float64 64 percent_bc_gt_75 1605 non-null float64 65 pub_rec_bankruptcies 1650 non-null int64 66 tax_liens 1650 non-null int64 67 tot_hi_cred_lim 1650 non-null int64 68 total_bal_ex_mort 1650 non-null int64 69 total_bc_limit 1650 non-null int64 70 total_il_high_credit_limit 1650 non-null int64 71 revol_bal_joint 1404 non-null float64 72 sec_app_fico_range_low 1404 non-null float64 73 sec_app_fico_range_high 1404 non-null float64 74 sec_app_inq_last_6mths 1404 non-null float64 75 sec_app_mort_acc 1404 non-null float64 76 sec_app_open_acc 1404 non-null float64 77 sec_app_revol_util 1370 non-null float64 78 sec_app_open_act_il 1404 non-null float64 79 sec_app_num_rev_accts 1404 non-null float64 80 sec_app_chargeoff_within_12_mths 1404 non-null float64 81 sec_app_collections_12_mths_ex_med 1404 non-null float64 82 fico_avg 1650 non-null float64 dtypes: float64(45), int64(38) memory usage: 1.0 MB
loans_joint_1 = loans_numeric_joint.iloc[:, 0:10]
loans_joint_2 = loans_numeric_joint.iloc[:, 10:20]
loans_joint_3 = loans_numeric_joint.iloc[:, 20:30]
loans_joint_4 = loans_numeric_joint.iloc[:, 30:40]
loans_joint_5 = loans_numeric_joint.iloc[:, 40:50]
loans_joint_6 = loans_numeric_joint.iloc[:, 50:60]
loans_joint_7 = loans_numeric_joint.iloc[:, 60:70]
loans_joint_8 = loans_numeric_joint.iloc[:, 70:81]
sns.pairplot(loans_joint_1)
<seaborn.axisgrid.PairGrid at 0x1b42d8a3e80>
sns.pairplot(loans_joint_2)
<seaborn.axisgrid.PairGrid at 0x1b4034b8ca0>
sns.pairplot(loans_joint_3)
<seaborn.axisgrid.PairGrid at 0x1b40abcfeb0>
sns.pairplot(loans_joint_4)
<seaborn.axisgrid.PairGrid at 0x1b406d31fa0>
sns.pairplot(loans_joint_5)
<seaborn.axisgrid.PairGrid at 0x1b40bdf3e20>
sns.pairplot(loans_joint_6)
<seaborn.axisgrid.PairGrid at 0x1b4103ae4c0>
sns.pairplot(loans_joint_7)
<seaborn.axisgrid.PairGrid at 0x1b41522d1f0>
sns.pairplot(loans_joint_8)
<seaborn.axisgrid.PairGrid at 0x1b4150ebf10>
sns.histplot(loans_joint.collections_12_mths_ex_med)
<AxesSubplot:xlabel='collections_12_mths_ex_med', ylabel='Count'>
sns.histplot(loans_joint.policy_code)
<AxesSubplot:xlabel='policy_code', ylabel='Count'>
sns.histplot(loans_joint.acc_now_delinq)
<AxesSubplot:xlabel='acc_now_delinq', ylabel='Count'>
sns.histplot(loans_joint.tot_coll_amt)
<AxesSubplot:xlabel='tot_coll_amt', ylabel='Count'>
sns.histplot(loans_joint.chargeoff_within_12_mths)
<AxesSubplot:xlabel='chargeoff_within_12_mths', ylabel='Count'>
sns.histplot(loans_joint.delinq_amnt)
<AxesSubplot:xlabel='delinq_amnt', ylabel='Count'>
sns.histplot(loans_joint.num_tl_120dpd_2m)
<AxesSubplot:xlabel='num_tl_120dpd_2m', ylabel='Count'>
sns.histplot(loans_joint.num_tl_30dpd)
<AxesSubplot:xlabel='num_tl_30dpd', ylabel='Count'>
sns.histplot(loans_joint.num_tl_90g_dpd_24m)
<AxesSubplot:xlabel='num_tl_90g_dpd_24m', ylabel='Count'>
Comprobando los escasos valores que nos aportan todas las variables representadas en los anteriores histogramas optamos por su eliminación.
loans_joint_descartes_2 = loans_joint.loc[:, ['collections_12_mths_ex_med', 'policy_code', 'acc_now_delinq', 'tot_coll_amt', 'num_tl_30dpd', 'num_tl_90g_dpd_24m']]
loans_joint = loans_joint.drop(loans_joint_descartes_2, axis = 1)
list(loans_joint)
['term', 'installment', 'grade', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'loan_status', 'purpose', 'title', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'application_type', 'annual_inc_joint', 'dti_joint', 'verification_status_joint', 'tot_cur_bal', 'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_inq', 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tax_liens', 'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit', 'revol_bal_joint', 'sec_app_fico_range_low', 'sec_app_fico_range_high', 'sec_app_earliest_cr_line', 'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il', 'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med', 'fico_avg']
loans_numeric_joint = loans_joint.select_dtypes(include = ['int64', 'float64'])
loans_numeric_joint.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1650 entries, 0 to 1649 Data columns (total 77 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 term 1650 non-null float64 1 installment 1650 non-null float64 2 emp_length 1397 non-null float64 3 annual_inc 1650 non-null float64 4 loan_status 1650 non-null float64 5 dti 1629 non-null float64 6 delinq_2yrs 1650 non-null int64 7 inq_last_6mths 1650 non-null int64 8 mths_since_last_delinq 1650 non-null int64 9 open_acc 1650 non-null int64 10 pub_rec 1650 non-null int64 11 revol_bal 1650 non-null int64 12 revol_util 1648 non-null float64 13 total_acc 1650 non-null int64 14 annual_inc_joint 1650 non-null float64 15 dti_joint 1650 non-null float64 16 tot_cur_bal 1650 non-null int64 17 open_acc_6m 1483 non-null float64 18 open_act_il 1483 non-null float64 19 open_il_12m 1483 non-null float64 20 open_il_24m 1483 non-null float64 21 mths_since_rcnt_il 1452 non-null float64 22 total_bal_il 1483 non-null float64 23 il_util 1250 non-null float64 24 open_rv_12m 1483 non-null float64 25 open_rv_24m 1483 non-null float64 26 max_bal_bc 1483 non-null float64 27 all_util 1482 non-null float64 28 total_rev_hi_lim 1650 non-null int64 29 inq_fi 1483 non-null float64 30 total_cu_tl 1483 non-null float64 31 inq_last_12m 1483 non-null float64 32 acc_open_past_24mths 1650 non-null int64 33 avg_cur_bal 1650 non-null float64 34 bc_open_to_buy 1605 non-null float64 35 bc_util 1605 non-null float64 36 chargeoff_within_12_mths 1650 non-null int64 37 delinq_amnt 1650 non-null int64 38 mo_sin_old_il_acct 1617 non-null float64 39 mo_sin_old_rev_tl_op 1650 non-null int64 40 mo_sin_rcnt_rev_tl_op 1650 non-null int64 41 mo_sin_rcnt_tl 1650 non-null int64 42 mort_acc 1650 non-null int64 43 mths_since_recent_bc 1605 non-null float64 44 mths_since_recent_inq 1506 non-null float64 45 num_accts_ever_120_pd 1650 non-null int64 46 num_actv_bc_tl 1650 non-null int64 47 num_actv_rev_tl 1650 non-null int64 48 num_bc_sats 1650 non-null int64 49 num_bc_tl 1650 non-null int64 50 num_il_tl 1650 non-null int64 51 num_op_rev_tl 1650 non-null int64 52 num_rev_accts 1650 non-null float64 53 num_rev_tl_bal_gt_0 1650 non-null int64 54 num_sats 1650 non-null int64 55 num_tl_120dpd_2m 1650 non-null float64 56 num_tl_op_past_12m 1650 non-null int64 57 pct_tl_nvr_dlq 1650 non-null float64 58 percent_bc_gt_75 1605 non-null float64 59 pub_rec_bankruptcies 1650 non-null int64 60 tax_liens 1650 non-null int64 61 tot_hi_cred_lim 1650 non-null int64 62 total_bal_ex_mort 1650 non-null int64 63 total_bc_limit 1650 non-null int64 64 total_il_high_credit_limit 1650 non-null int64 65 revol_bal_joint 1404 non-null float64 66 sec_app_fico_range_low 1404 non-null float64 67 sec_app_fico_range_high 1404 non-null float64 68 sec_app_inq_last_6mths 1404 non-null float64 69 sec_app_mort_acc 1404 non-null float64 70 sec_app_open_acc 1404 non-null float64 71 sec_app_revol_util 1370 non-null float64 72 sec_app_open_act_il 1404 non-null float64 73 sec_app_num_rev_accts 1404 non-null float64 74 sec_app_chargeoff_within_12_mths 1404 non-null float64 75 sec_app_collections_12_mths_ex_med 1404 non-null float64 76 fico_avg 1650 non-null float64 dtypes: float64(45), int64(32) memory usage: 992.7 KB
corr = loans_joint.corr()
mask = np.triu(np.ones_like(corr, dtype = bool))
cmap = sns.diverging_palette(222, 222, as_cmap = True)
f, ax = plt.subplots(figsize = (11, 9))
sns.heatmap(corr, mask = mask, cmap = cmap, center = 0, square = True, linewidths = .5)
<AxesSubplot:>
loans_numeric_2 = loans_numeric.iloc[:, 30:70]
corr2 = loans_numeric_2.corr()
mask = np.triu(np.ones_like(corr2, dtype = bool))
cmap = sns.diverging_palette(222, 222, as_cmap = True)
f, ax = plt.subplots(figsize = (11, 9))
sns.heatmap(corr2, mask = mask, cmap = cmap, center = 0, square = True, linewidths = .5)
<AxesSubplot:>
Aplicando el mismo criterio que con las individuales eliminaremos aquellas que nos den alta correlación.
En primer lugar observamos alta correlación entre num_bc_sats y num_sats, ambas dan información semejante sin embargo num_sats todavía aporta más, por lo que eliminaremos la primera. Sucede lo mismo entre las variables num_rev_accts y num_rev_tl_bsl_gt_0. Se eliminará la 2º ya que aporta más info. Ocurre igual entre num_rev_accts y num_op_rev_tl. Se eliminará la 2º por ofrecer menos información.
joint_descartes_3 = loans_joint.loc[:,['num_bc_sats', 'num_op_rev_tl', 'num_rev_tl_bal_gt_0']]
loans_joint = loans_joint.drop(joint_descartes_3, axis = 1)
loans_joint.to_csv('../data/loans_joint_clean.csv')